library(tidyverse)
library(tigris)
library(sf)
library(usmap) # May need to install.packages to run this 
library(ggplot2)
library(dplyr)
library(forecast)
library(plotly)


knitr::opts_chunk$set(
  fig.width = 6,
  fig.asp = .6,
  out.width = "90%"
)

theme_set(theme_minimal() + theme(legend.position = "bottom"))

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

Downloading state shape file Data:

shape_files = usmap::us_map()

asthma_df = read_csv("data/asthma_data.csv")|>
<<<<<<< HEAD
  mutate(year= year_name)
## Rows: 559 Columns: 3
## ── Column specification ──────
## Delimiter: ","
## chr (1): state
## dbl (2): year_name, prevalence_percent
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
weather_df = read_csv("data/temp_data.csv")
## Rows: 2193 Columns: 4
## ── Column specification ──────
## Delimiter: ","
## chr (2): state, season
## dbl (2): year, avg_temp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Merging Asthma & temp & shapefile data

======= mutate(year= year_name) weather_df = read_csv("data/temp_data.csv")

Merging Asthma & temp & shape file data:

>>>>>>> 5e131e6fb3b6434be0fb1a17d891441a44f7c21f
asthma_weather = 
  asthma_df |>
  left_join(weather_df, by = c("state", "year")) 


final_df =
  shape_files |>
  mutate(state = abbr) |>                           
  left_join(asthma_weather, by = "state") |>        
  drop_na()                                         

Mapping Prevalence data by state:

ggplot= 
  final_df|> 
  filter(year==2021)|> 
  ggplot() +
  geom_sf(aes(fill = prevalence_percent), color = "white") +
  scale_fill_viridis_c(na.value = "grey90") + 
  theme_minimal() +
  labs(
    title = "Adult Asthma Prevalence by State (2021)",
    fill = "Prevalence (%)"
  ) +
  theme(
    panel.grid = element_blank(), 
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )

<<<<<<< HEAD
final_df|> 
  filter(year==2012)|>
  ggplot() +
  geom_sf(aes(fill = prevalence_percent), color = "white") +
  scale_fill_viridis_c(na.value = "grey90") + 
  theme_minimal() +
  labs(
    title = "Asthma Prevalence by State (2012)",
    fill = "Prevalence (%)"
  ) +
  theme(
    panel.grid = element_blank(), 
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )

final_df|> 
  filter(year==2013)|>
  ggplot() +
  geom_sf(aes(fill = prevalence_percent), color = "white") +
  scale_fill_viridis_c(na.value = "grey90") + 
  theme_minimal() +
  labs(
    title = "Asthma Prevalence by State (2013)",
    fill = "Prevalence (%)"
  ) +
  theme(
    panel.grid = element_blank(), 
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank()
  )

plotly_map <- ggplotly(ggplot)

# Display the Plotly map
plotly_map

Do regions with higher asthma prevalence overlap with areas of lower average temperature?

Does analyzing prevalence data stratified by time and temperature make a difference?

======= ggplot|> ggplotly()

Goals of Data Visualization

We aim to display the distribution of asthma cases from 2001-2021 using box or violin plots to guide the direction of our analysis to plot weather temperatures in these states with the most density of asthma cases. We intend to draw insight for the following questions regarding our hypothesis:

Asthma Trend Across the Us over time:

aggregated_data=
  final_df|>
  group_by(year)|>
  summarise(avg_prevalence = mean(prevalence_percent, na.rm = TRUE))

aggregated_data |>
  ggplot(aes(x = year, y = avg_prevalence)) +
  geom_line() +  # Line for the time series
  geom_point(color = "red") +  # Scatter points
  labs(
    title = "Adult Asthma Trend Across the US Over Time",
    x = "Year",
    y = "Average Asthma Prevalence (%)"
  ) +
  theme_minimal()

Distribution of Asthma across states:

The box plot method provided a clearer visual representation compared to the violin density plot, effectively illustrating the distribution of data and highlighting the concentration of values within each state.

ggplotly( 
  final_df|>
  group_by(full)|>
  ggplot(aes(x= reorder(full,prevalence_percent), y= prevalence_percent))+ 
  geom_boxplot()+
  labs(title= "Distribution of Adult Asthma across states" )+
  xlab("State")+
  ylab("Asthma Prevalance (%)")+
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ))

This plot shows the distribution of asthma across states across all years 2011-2021. States located further right on the plot are shown to have higher asthma prevalence by percent compared to other states. States located in the upper North East such as Maine, Rhode Island, Vermont, and New Hampshire had consistently higher asthma prevalence from 2011-2021 compared to other states. States with some of the lowest asthma prevalence (%) were Texas, South Dakota, Florida, Nebraska, and Minnesota.

ggplotly( 
  final_df|> 
  group_by(full)|>
  ggplot(aes(x= reorder(full, avg_temp), y= avg_temp))+ 
  geom_boxplot()+
  labs(title= "Distribution of Temperature across states" )+
  xlab("State")+
  ylab("Temperature (C)")+
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ))

When comparing states with higher asthma prevalence to their temperature distributions, we observe that states such as Maine, Rhode Island, Vermont, and New Hampshire exhibit lower median average temperatures and larger interquartile ranges. This variation reflects the significant seasonal temperature fluctuations in these regions. Notably, when comparing the upper and lower 25% of temperature distributions in these states, the temperatures in the upper quartile are more variable. This variability is important to highlight, as previous studies have suggested that extreme temperatures may increase the risk of asthma exacerbation. Overall, varying temperature conditions (both high and low extremes) might influence asthma prevalence across states.

have temperatures changed overtime? -> t.test across years? heatmap?

Initial statistical anslysis (Idk if we should keep this part )

final_df= 
  final_df |>
  mutate(
    temp_category = case_when(
      avg_temp < 0 ~ "Cold",
      avg_temp >= 0 & avg_temp < 15 ~ "Mild",
      avg_temp >= 15 ~ "Warm"))|> 
  select(-c("abbr","fips", "year_name"))|>
  select(state,prevalence_percent,avg_temp, season,year,temp_category,everything())

final_df
## Simple feature collection with 2185 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -2590847 ymin: -2608148 xmax: 2523581 ymax: 731407.9
## Projected CRS: NAD27 / US National Atlas Equal Area
## # A tibble: 2,185 × 8
##    state prevalence_percent avg_temp season  year temp_category full  
##    <chr>              <dbl>    <dbl> <chr>  <dbl> <chr>         <chr> 
##  1 AK                   8.2    1.83  Fall    2011 Mild          Alaska
##  2 AK                   8.2    2.12  Spring  2011 Mild          Alaska
##  3 AK                   8.2   11.9   Summer  2011 Mild          Alaska
##  4 AK                   8.2   -5.34  Winter  2011 Cold          Alaska
##  5 AK                   9      1.17  Fall    2012 Mild          Alaska
##  6 AK                   9      1.49  Spring  2012 Mild          Alaska
##  7 AK                   9     11.8   Summer  2012 Mild          Alaska
##  8 AK                   9     -8.34  Winter  2012 Cold          Alaska
##  9 AK                   9.3    3.55  Fall    2013 Mild          Alaska
## 10 AK                   9.3    0.749 Spring  2013 Mild          Alaska
## # ℹ 2,175 more rows
## # ℹ 1 more variable: geom <MULTIPOLYGON [m]>

Does Correlation differ by state?

cor_results= 
  final_df |>
  group_by(state) |>
  summarise(
    cor_test = list(cor.test(avg_temp, prevalence_percent)),
    .groups = "drop")|>
  rowwise()|>
  mutate(
    corr=cor_test[["estimate"]],
    p_value= cor_test[["p.value"]])|>
  ungroup()|>
  select(-cor_test)

cor_results
## Simple feature collection with 50 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -2590847 ymin: -2608148 xmax: 2523581 ymax: 731407.9
## Projected CRS: NAD27 / US National Atlas Equal Area
## # A tibble: 50 × 4
##    state                                                   geom     corr p_value
##    <chr>                                     <MULTIPOLYGON [m]>    <dbl>   <dbl>
##  1 AK    (((-2396847 -2547721, -2393297 -2546391, -2391552 -25…  0.0578    0.709
##  2 AL    (((1093777 -1378535, 1093269 -1374223, 1092965 -13414…  0.0106    0.945
##  3 AR    (((483065.2 -927788.2, 506062 -926263.3, 531512.5 -92…  0.0142    0.927
##  4 AZ    (((-1388676 -1254584, -1389181 -1251856, -1384522 -12…  0.0613    0.693
##  5 CA    (((-1719946 -1090033, -1709611 -1090026, -1700882 -11… -0.00830   0.957
##  6 CO    (((-789538.7 -678773.8, -789538.2 -678769.5, -781696.… -0.0228    0.883
##  7 CT    (((2161733 -83737.52, 2177182 -65221.22, 2168999 -581…  0.0308    0.843
##  8 DE    (((2042506 -284367.3, 2043078 -280000.3, 2044959 -275…  0.0219    0.888
##  9 FL    (((1855611 -2064809, 1860157 -2054372, 1867238 -20477… -0.0717    0.660
## 10 GA    (((1311336 -999180.8, 1323127 -997255.3, 1331180 -995… -0.00238   0.988
## # ℹ 40 more rows

Correlation is small across all states

Does the effect of prevalence differ by state:

state_trends= 
  final_df |>
  group_by(state) |>
  summarise(
    model = list(lm(prevalence_percent ~ year)))


state_trends= 
  state_trends |>
  mutate(
    model_summary = map(model, broom::tidy)) |>
  unnest(model_summary)|>
  filter(term == "year")|>
  select(
    state, 
    slope = estimate, 
    intercept = NULL, 
    p_value = p.value)|>
  st_drop_geometry()|>
  knitr::kable(digits=5)

state_trends
state slope p_value
AK 0.06182 0.00868
AL 0.16455 0.00001
AR 0.02909 0.25949
AZ 0.06091 0.00049
CA 0.02636 0.29868
CO 0.16364 0.00000
CT 0.09364 0.00000
DE 0.04727 0.16551
FL -0.05152 0.09681
GA 0.00455 0.86590
HI -0.06182 0.04633
IA 0.08273 0.00179
ID 0.10000 0.00000
IL 0.03455 0.07010
IN 0.02182 0.28358
KS 0.18364 0.00000
KY 0.05545 0.17836
LA 0.19909 0.00000
MA -0.00909 0.76559
MD 0.04636 0.00314
ME 0.01455 0.59673
MI 0.09091 0.00002
MN 0.11364 0.00000
MO -0.05727 0.01216
MS 0.22636 0.00000
MT 0.11455 0.00003
NC 0.03455 0.19492
ND 0.03182 0.09917
NE 0.10545 0.00000
NH 0.14909 0.00023
NJ -0.00400 0.87883
NM 0.04292 0.17131
NV 0.19000 0.00000
NY -0.01636 0.41568
OH 0.01727 0.47857
OK 0.12545 0.00000
OR 0.05091 0.01235
PA 0.10091 0.00000
RI 0.09364 0.00184
SC 0.12182 0.00000
SD 0.09909 0.00123
TN 0.30727 0.00000
TX 0.07182 0.00022
UT 0.14455 0.00000
VA 0.05636 0.00440
VT 0.05636 0.01882
WA 0.04818 0.00625
WI 0.10182 0.00312
WV 0.32091 0.00000
WY 0.09182 0.00036